In [1]:
## Importing libraries
In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
In [3]:
## Reading dataset
In [4]:
df=pd.read_csv('ds_salaries.csv')
df.head()
Out[4]:
work_year experience_level employment_type job_title salary salary_currency salary_in_usd employee_residence remote_ratio company_location company_size
0 2023 SE FT Principal Data Scientist 80000 EUR 85847 ES 100 ES L
1 2023 MI CT ML Engineer 30000 USD 30000 US 100 US S
2 2023 MI CT ML Engineer 25500 USD 25500 US 100 US S
3 2023 SE FT Data Scientist 175000 USD 175000 CA 100 CA M
4 2023 SE FT Data Scientist 120000 USD 120000 CA 100 CA M
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB
In [6]:
df.shape #checking the dimension of dataset
Out[6]:
(3755, 11)
In [7]:
## Checking missing values
In [8]:
df.isnull().sum()
Out[8]:
work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64
In [9]:
#removing unneccssary columns
df.drop(['salary','salary_currency'],axis=1,inplace=True)
In [10]:
df.duplicated() ## Checkking duplicate rows
Out[10]:
0       False
1       False
2       False
3       False
4       False
        ...  
3750    False
3751    False
3752    False
3753    False
3754    False
Length: 3755, dtype: bool
In [11]:
df[df.duplicated()] #Duplicate data output
Out[11]:
work_year experience_level employment_type job_title salary_in_usd employee_residence remote_ratio company_location company_size
115 2023 SE FT Data Scientist 150000 US 0 US M
123 2023 SE FT Analytics Engineer 289800 US 0 US M
153 2023 MI FT Data Engineer 100000 US 100 US M
154 2023 MI FT Data Engineer 70000 US 100 US M
160 2023 SE FT Data Engineer 115000 US 0 US M
... ... ... ... ... ... ... ... ... ...
3439 2022 MI FT Data Scientist 78000 US 100 US M
3440 2022 SE FT Data Engineer 135000 US 100 US M
3441 2022 SE FT Data Engineer 115000 US 100 US M
3586 2021 MI FT Data Engineer 200000 US 100 US L
3709 2021 MI FT Data Scientist 90734 DE 50 DE L

1171 rows × 9 columns

In [12]:
df.describe() #Descriptive analysis
Out[12]:
work_year salary_in_usd remote_ratio
count 3755.000000 3755.000000 3755.000000
mean 2022.373635 137570.389880 46.271638
std 0.691448 63055.625278 48.589050
min 2020.000000 5132.000000 0.000000
25% 2022.000000 95000.000000 0.000000
50% 2022.000000 135000.000000 0.000000
75% 2023.000000 175000.000000 100.000000
max 2023.000000 450000.000000 100.000000
In [13]:
df.columns
Out[13]:
Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary_in_usd', 'employee_residence', 'remote_ratio',
       'company_location', 'company_size'],
      dtype='object')
In [14]:
for methods in df.columns:
    print(methods,":",df[methods].nunique()) #Number of unique values in dataframe
work_year : 4
experience_level : 4
employment_type : 4
job_title : 93
salary_in_usd : 1035
employee_residence : 78
remote_ratio : 3
company_location : 72
company_size : 3
In [15]:
#Exploratory Data analysis
In [16]:
df['experience_level'].unique()
Out[16]:
array(['SE', 'MI', 'EN', 'EX'], dtype=object)

Here We can see there are 4 unique values which are:

  1. SE - Senior level/Expert
  2. MI - Medium level/Intermediate
  3. EN - Entry level
  4. EX - Executive level

In [17]:
df['employment_type'].unique()
Out[17]:
array(['FT', 'CT', 'FL', 'PT'], dtype=object)

It has 4 unique values which are :

  • FT - Full Time
  • CT - Contract
  • FL - Freelance
  • PT - Part Time

In [18]:
#Top 15 Job title
job_title=df['job_title'].value_counts().head(15)
job_title
Out[18]:
Data Engineer                 1040
Data Scientist                 840
Data Analyst                   612
Machine Learning Engineer      289
Analytics Engineer             103
Data Architect                 101
Research Scientist              82
Data Science Manager            58
Applied Scientist               58
Research Engineer               37
ML Engineer                     34
Data Manager                    29
Machine Learning Scientist      26
Data Science Consultant         24
Data Analytics Manager          22
Name: job_title, dtype: int64
In [19]:
x=df['job_title'].value_counts().head(15).index
In [20]:
y=df['job_title'].value_counts().head(15).values
In [21]:
fig=go.Figure(data=go.Bar(x=x,y=y))
fig.update_xaxes(title="Job title")
fig.update_yaxes(title="Frequency")
fig.update_layout(title="Top 15 job Title",width=550,
    height=600)
fig.show()
In [22]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary_in_usd       3755 non-null   int64 
 5   employee_residence  3755 non-null   object
 6   remote_ratio        3755 non-null   int64 
 7   company_location    3755 non-null   object
 8   company_size        3755 non-null   object
dtypes: int64(3), object(6)
memory usage: 264.1+ KB
In [23]:
##Average Salaries Based On Job Titles
df.groupby('job_title').mean()['salary_in_usd']
Out[23]:
job_title
3D Computer Vision Researcher     21352.250000
AI Developer                     136666.090909
AI Programmer                     55000.000000
AI Scientist                     110120.875000
Analytics Engineer               152368.631068
                                     ...      
Research Engineer                163108.378378
Research Scientist               161214.195122
Software Data Engineer            62510.000000
Staff Data Analyst                15000.000000
Staff Data Scientist             105000.000000
Name: salary_in_usd, Length: 93, dtype: float64
In [24]:
## Maximum Salary getting job role
df.groupby('job_title').mean()['salary_in_usd'].idxmax()
Out[24]:
'Data Science Tech Lead'
In [25]:
df.groupby('job_title').mean()['salary_in_usd'].max()
Out[25]:
375000.0
In [26]:
### Salaries based on Experience level
exp=df.groupby('experience_level').mean()['salary_in_usd']
exp
Out[26]:
experience_level
EN     78546.284375
EX    194930.929825
MI    104525.939130
SE    153051.071542
Name: salary_in_usd, dtype: float64
In [27]:
## Maximum Salary getting job role
df.groupby('experience_level').mean()['salary_in_usd'].idxmax()
Out[27]:
'EX'
In [28]:
df.groupby('experience_level').mean()['salary_in_usd'].max()
Out[28]:
194930.9298245614
In [29]:
X=df.groupby('experience_level').mean()['salary_in_usd'].index
In [30]:
Y=df.groupby('experience_level').mean()['salary_in_usd'].values
In [31]:
fig=go.Figure(data=go.Bar(x=X,y=Y))
fig.update_xaxes(title="Job role")
fig.update_yaxes(title="Frequency")
fig.update_layout(title="Job role acc. to experience",width=500,
    height=500)
fig.show()
In [32]:
##Salaries based on Employment Type
ET=df['salary_in_usd'].groupby(df['employment_type']).mean()
ET
Out[32]:
employment_type
CT    113446.900000
FL     51807.800000
FT    138314.199570
PT     39533.705882
Name: salary_in_usd, dtype: float64
In [33]:
X1=df['salary_in_usd'].groupby(df['employment_type']).mean().index
In [34]:
Y1=df['salary_in_usd'].groupby(df['employment_type']).mean().values
In [35]:
fig=go.Figure(data=go.Bar(x=X1,y=Y1))
fig.update_xaxes(title="Emplyement Type")
fig.update_yaxes(title="Frequency")
fig.update_layout(title="Employment type with salary",width=500,
    height=500)
fig.show()
In [36]:
##Average Salaries based on Year(2020-2023)
WY=df['salary_in_usd'].groupby(df['work_year']).mean()
WY
Out[36]:
work_year
2020     92302.631579
2021     94087.208696
2022    133338.620793
2023    149045.541176
Name: salary_in_usd, dtype: float64
In [37]:
X2=df['salary_in_usd'].groupby(df['work_year']).mean().index
Y2=df['salary_in_usd'].groupby(df['work_year']).mean().values
In [39]:
fig=go.Figure(data=go.Line(x=X2,y=Y2))
fig.update_xaxes(title="Emplyement Type")
fig.update_yaxes(title="Frequency")
fig.update_layout(title='Salaries based on Year(2020-23)',width=500,
    height=500)
fig.show()
In [ ]: